TAXTAB <- data.frame(as(tax_table(psSubj), "matrix")) %>%
rownames_to_column("Seq_ID") %>%
select(-Seq) %>%
mutate(
OrgName.RDP = paste(Genus, Species),
OrgName.Silva = paste(GenusSilva, SpeciesSilva),
OrgName = ifelse(grepl("NA", OrgName.RDP) & !grepl("NA", OrgName.RDP),
OrgName.Silva, OrgName.RDP),
OrgName = ifelse(OrgName == "NA NA", "NA", OrgName),
OrgName = paste0(Seq_ID, ": ", OrgName)) %>%
select(1:8, OrgName)
head(TAXTAB)
Seq_ID Kingdom Phylum Class Order Family Genus Species
1 Seq1 Bacteria Firmicutes Clostridia Clostridiales Ruminococcaceae Faecalibacterium prausnitzii
2 Seq2 Bacteria Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides vulgatus
3 Seq3 Bacteria Firmicutes Clostridia Clostridiales Ruminococcaceae Faecalibacterium prausnitzii
4 Seq4 Bacteria Bacteroidetes Bacteroidia Bacteroidales Prevotellaceae Prevotella <NA>
5 Seq5 Bacteria Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides vulgatus
6 Seq6 Bacteria Firmicutes Clostridia Clostridiales Ruminococcaceae Faecalibacterium prausnitzii
OrgName
1 Seq1: Faecalibacterium prausnitzii
2 Seq2: Bacteroides vulgatus
3 Seq3: Faecalibacterium prausnitzii
4 Seq4: Prevotella NA
5 Seq5: Bacteroides vulgatus
6 Seq6: Faecalibacterium prausnitzii
load("output/pairwise_dist_to_baseline_subj_16S.rda")
load("output/fpca_clust_res_abx.rda")
load("output/fpca_clust_res_diet.rda")
load("output/fpca_res.rda")
ls()
[1] "abx_fclust" "abx_fclust_mu" "abx_fclust_subjFit" "abx_intv_cols"
[5] "abx.rsv.subset" "bray_to_baseline" "bray.df" "brayD.ihs"
[9] "cc_intv_cols" "curdir" "datadir" "diet_fclust"
[13] "diet_fclust_mu" "diet_fclust_subjFit" "diet_intv_cols" "diet.rsv.subset"
[17] "fpca.bray.abx" "fpca.bray.cc" "fpca.bray.cc30" "fpca.bray.diet"
[21] "fpca.bray.diet30" "intv_cols" "jacc_to_baseline" "jacc.df"
[25] "jaccardD" "keep_rsv" "psSubj" "SMP"
[29] "SUBJ" "TAXTAB" "theme_subplot" "uniFrac_to_baseline"
[33] "uniFrac.df" "uniFracD.lst"
source("./fpca_funs.R")
# this is because some subjects have multiple samples take the same day
bray_to_baseline_fltr <- bray_to_baseline %>%
group_by(perturbation, Subject, Group, Interval, RelDay) %>%
summarise(n = n(), dist_to_baseline = mean(dist_to_baseline)) %>%
ungroup()
bray_to_baseline_fltr %>% filter(n > 1)
bray_to_baseline_fltr %>%
filter(perturbation == "Abx") %>%
filter(RelDay >= -50, RelDay <= 60) %>%
mutate(Interval = factor(Interval, level = names(abx_intv_cols))) %>%
ggplot(
aes(x = RelDay, y = dist_to_baseline,
group = Subject, color = Interval)) +
geom_line(aes(group = Subject), alpha = 0.7, lwd = 0.5) +
geom_point(alpha = 0.5, size = 1.2) +
scale_color_manual(values = abx_intv_cols) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(size=3)))
# +
# xlab("Days from initial antibiotic dose") +
# ylab("Bray-Curtis distance to 7 pre-antibiotic samples")
fpca.bray.abx <- fit_dist_to_baseline(
bray_to_baseline_fltr %>% filter(
perturbation == "Abx", RelDay >= -50, RelDay <= 60))
save(list = c("fpca.bray.abx"), file = "output/fpca_res.rda")
(pAbx <- bray_to_baseline_fltr %>%
filter(perturbation == "Abx", RelDay >= -50, RelDay <= 60) %>%
ggplot(aes(x = RelDay, y = dist_to_baseline)) +
geom_line(
data = fpca.bray.abx[["fitted"]],
aes(group = Subject, x = time, y = value),
alpha = 0.3, size = 0.7, color = "grey30") +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_point(aes(color = Interval), size = 1.5, alpha = 0.7) +
geom_line(
data = fpca.bray.abx[["mean"]], aes(x = time, y = value),
color = "navy", size = 2) +
scale_color_manual(values = abx_intv_cols, name = "Interval") +
scale_x_continuous(
name = "Days from initial antibiotic dose",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
scale_y_continuous(
name = "Bray-Curtis distance to baseline",
limits = c(0.1, 0.85), breaks = seq(0.1, 0.80, 0.1)) +
theme(text = element_text(size = 20)))
fpca.bray.abx[["fitted"]] <- fpca.bray.abx[["fitted"]] %>%
mutate(
Interval = ifelse(fpca.bray.abx[["fitted"]]$time < 0 , "PreAbx",
ifelse(fpca.bray.abx[["fitted"]]$time >= 0 & fpca.bray.abx[["fitted"]]$time <= 4, "MidAbx",
"PostAbx")))
(pAbxDeriv <- fpca.bray.abx[["fitted"]] %>%
ggplot(aes(x = RelDay)) +
geom_line(
aes(group = Subject, x = time, y = deriv, color = Interval),
alpha = 0.5, size = 0.7) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
scale_color_manual(values = abx_intv_cols, name = "Interval") +
scale_x_continuous(name = "",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
ylab("Derivative"))
(pAbxLab <- bray_to_baseline_fltr %>%
filter(perturbation == "Abx", RelDay >= -50, RelDay <= 60) %>%
ggplot(aes(x = RelDay, y = dist_to_baseline)) +
geom_line(
data = fpca.bray.abx[["fitted"]],
aes(group = Subject, x = time, y = value),
alpha = 0.3, size = 0.7 ) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_text(
aes(label = Subject, color = Interval), size = 4, alpha = 0.7) +
geom_line(
data = fpca.bray.abx[["mean"]], aes(x = time, y = value),
color = "navy", size = 2) +
scale_color_manual(values = abx_intv_cols, name = "Interval") +
scale_x_continuous(
name = "Days from initial antibiotic dose",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
scale_y_continuous(
name = "Bray-Curtis distance to baseline",
limits = c(0.1, 0.85), breaks = seq(0.1, 0.80, 0.1)) +
theme(text = element_text(size = 20)))
# geom_point(
# data = abx_bray %>% filter(RelDay > StabTime),
# color = "orange", size = 2.5) +
vp = grid::viewport(width = 0.3, height = 0.32, x = 0.07, y =0.95, just = c("left", "top"))
print(pAbx)
print(pAbxDeriv + theme_bw(base_size = 10) + theme_subplot, vp = vp)
abxFac <- c("PreAbx", "MidAbx", "PostAbx")
abx <- subset_samples(psSubj, Abx_RelDay >= -50 & Abx_RelDay <= 60)
abx <- subset_taxa(abx, taxa_sums(abx) > 0)
abx
(replicated <- data.frame(sample_data(abx)) %>%
group_by(Subject, Group, Interval, Abx_RelDay) %>%
mutate(n = n()) %>%
ungroup() %>% filter(n > 1))
# Remove replicated samples
sample_sums(abx)[replicated$Meas_ID]
# M7869 M7886
# 74697 74873
# we retain a replicate with higher sample depth:
abx <- subset_samples(abx, !Meas_ID %in% c("M2243", "M7869"))
abx
#otu_table() OTU Table: [ 2339 taxa and 1418 samples ]
# Asinh the data so that the counts are comparable
abx.rsv <- data.frame(asinh(as(otu_table(abx), "matrix"))) %>%
rownames_to_column("Seq_ID") %>%
gather(Meas_ID, abundance, -Seq_ID) %>%
left_join(SMP) %>%
left_join(TAXTAB) %>%
mutate(Abx_Interval = factor(Abx_Interval, levels = abxFac)) %>%
arrange(Seq_ID, Subject, Abx_RelDay)
thresh <- 0; num_nonzero <- 10;
min_no_subj <- 3
keep_rsv <- abx.rsv %>%
filter(abundance > thresh) %>%
group_by(Subject, Seq_ID) %>%
summarise(Freq = n()) %>% # No. of samples per subject w/ positive rsv count
filter(Freq >= num_nonzero) %>%
group_by(Seq_ID) %>%
summarise(Freq = n()) %>% # Number of subjects w/ num_nonzero samples with counts > thresh
filter(Freq >= min_no_subj) %>%
arrange(desc(Freq))
length(unique(keep_rsv$Seq_ID)) # = 839
abx.rsv.subset <- abx.rsv %>%
filter(Seq_ID %in% unique(keep_rsv$Seq_ID))
#save(list = c("keep_rsv", "abx.rsv.subset"), file = "results/fpca_clust_res_abx.rda")
abx_fclust <- fpca_wrapper(
abx.rsv.subset,
time_column = "Abx_RelDay",
value_column = "abundance",
replicate_column = "Subject",
feat_column = "Seq_ID",
cluster = TRUE,
clust_min_num_replicate = 15,
fpca_optns = NULL, fclust_optns = NULL,
parallel = TRUE, ncores = 16)
save(list = c("abx_fclust"),
file = "results/abx_rsv_fclust.rda")
abx_fclust_mu <- get_fpca_means(abx_fclust)
abx_fclust_subjFit <- get_fpca_fits(abx_fclust)
save(list = c("keep_rsv", "abx.rsv.subset",
"abx_fclust", "abx_fclust_subjFit", "abx_fclust_mu"),
file = "output/fpca_clust_res_abx.rda")
We now find the taxa with the most variability in the response to ABX between two clusters:
# Taxa's trajectory difference between two clusters
diffBetweenClusts.abx <- abx_fclust_subjFit %>%
group_by(Feature_ID, time, Cluster ) %>%
summarise(value = mean(value, na.rm = TRUE)) %>%
spread(key = "Cluster", value = "value") %>%
ungroup() %>%
mutate(diff = abs(`1` - `2`)) %>%
group_by(Feature_ID) %>%
summarise(
mean_clustDist_L1 = mean(diff, na.rm = TRUE),
sd_Clust1 = sd(`1`),
sd_Clust2 = sd(`2`)) %>%
filter(
!is.na(mean_clustDist_L1)
) %>% # Some taxa have a single cluster (everyone behaves similarly)
arrange(-mean_clustDist_L1)
summary(diffBetweenClusts.abx)
Feature_ID mean_clustDist_L1 sd_Clust1 sd_Clust2
Length:228 Min. :0.2363 Min. :0.02904 Min. :0.01393
Class :character 1st Qu.:1.2631 1st Qu.:0.20271 1st Qu.:0.20727
Mode :character Median :1.9057 Median :0.36080 Median :0.36572
Mean :1.9448 Mean :0.57786 Mean :0.59330
3rd Qu.:2.5430 3rd Qu.:0.62558 3rd Qu.:0.75281
Max. :4.6640 Max. :3.68735 Max. :3.22498
diffBetweenClusts.abx <- diffBetweenClusts.abx %>%
filter(sd_Clust1 > median(diffBetweenClusts.abx$sd_Clust1) |
sd_Clust2 > median(diffBetweenClusts.abx$sd_Clust2))
# Plot top 20
seq_to_plot <- diffBetweenClusts.abx$Feature_ID[1:20]
tax_to_plot <- (TAXTAB %>% column_to_rownames("Seq_ID"))[seq_to_plot, "OrgName"]
tax_to_plot
[1] "Seq4: Prevotella NA" "Seq3: Faecalibacterium prausnitzii"
[3] "Seq59: Bacteroides NA" "Seq68: Bacteroides NA"
[5] "Seq74: Clostridium_XVIII NA" "Seq11: Bacteroides vulgatus"
[7] "Seq7: Faecalibacterium prausnitzii" "Seq1: Faecalibacterium prausnitzii"
[9] "Seq25: Bacteroides NA" "Seq8: Bacteroides dorei"
[11] "Seq6: Faecalibacterium prausnitzii" "Seq37: Faecalibacterium prausnitzii"
[13] "Seq123: NA" "Seq28: Faecalibacterium prausnitzii"
[15] "Seq66: Blautia NA" "Seq12: NA"
[17] "Seq49: Bifidobacterium NA" "Seq30: Parabacteroides distasonis"
[19] "Seq18: Bacteroides uniformis" "Seq31: Clostridium_XVIII NA"
seq1_subjClusters <- abx_fclust_subjFit %>%
filter(Feature_ID == "Seq1") %>%
select(Replicate_ID, Cluster) %>% distinct() %>%
filter(Cluster==2)
abx2plot.fclust <- abx_fclust_subjFit %>%
left_join(TAXTAB, by = c("Feature_ID" = "Seq_ID")) %>%
group_by(Feature_ID) %>%
mutate(
n_cluster1 = sum(Cluster == 1),
n_cluster2 = sum(Cluster == 2),
majorityCluster = ifelse(n_cluster1 > n_cluster2, 1, 2)
) %>%
ungroup() %>%
mutate(
OrgName = factor(OrgName, levels = tax_to_plot),
Subject_Cluster = ifelse(Cluster == majorityCluster, "Majority", "Minority"))
abx2plot <- abx.rsv.subset %>%
filter(Seq_ID %in% seq_to_plot) %>%
left_join(
abx2plot.fclust %>%
select(-value, -time) %>%
distinct() %>%
rename(Seq_ID = Feature_ID, Subject = Replicate_ID)
) %>%
mutate(
Seq_ID = factor(Seq_ID, levels = seq_to_plot),
OrgName = factor(OrgName, levels = tax_to_plot))
Joining, by = c("Seq_ID", "Subject", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "OrgName")
Column `OrgName` joining character vector and factor, coercing into character vector
abx2plot.fclust %>%
filter(Feature_ID %in% seq_to_plot) %>%
ggplot(aes(x = time, y = value, color = Subject_Cluster)) +
geom_line(
aes(group = Replicate_ID),
alpha = 0.3, size = 0.5
) +
geom_smooth(se = FALSE) +
geom_point(
data = abx2plot,
aes(x = Abx_RelDay, y = abundance),
size = 0.3, alpha = 0.5
) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_line(
data = abx_fclust_mu %>%
filter(Feature_ID %in% seq_to_plot) %>%
left_join(TAXTAB, by = c("Feature_ID" = "Seq_ID")) %>%
mutate(OrgName = factor(OrgName, levels = tax_to_plot)),
color = "red", size = 1
) +
scale_color_manual(values = c( "grey17", "deepskyblue2")) +
facet_wrap(~ OrgName, scales = "free", ncol = 4) +
theme(strip.text = element_text(family="Helvetica-Narrow", size = 10)) +
scale_x_continuous(name = "Days from initial antibiotic dose",
breaks = seq(-40, 120, 20), labels = seq(-40, 120, 20),
limits = c(NA, NA))
Using mean response to Abx
seqClusters.abx <- fit_fpca(
abx_fclust_majority,
"time", "value", "Feature_ID",
cluster = TRUE, K = 8)
write.csv(seqClusters_fpca.abx %>% select(Seq_ID, Cluster) %>% distinct() %>%
left_join(TAXTAB) %>% arrange(Cluster),
file = "output/abx_seqClusters.csv")
Joining, by = "Seq_ID"
ggplot(
seqClusters_fpca.abx, aes(x = time, y = value)) +
geom_line(
aes(group = Seq_ID, color = Cluster),
size = 0.7, alpha = 0.7
) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 5, lwd = 1, color = "orange") +
geom_smooth(color = "grey20") +
facet_wrap(~ Cluster, labeller = label_both, ncol = 4) +
scale_color_brewer(palette = "Set1") +
guides(color = guide_legend(override.aes = list(size=4)))
top_rsv <- seqClusters_fpca.abx %>%
group_by(Seq_ID, Cluster) %>%
summarise(mean_abnd = mean(value)) %>%
arrange(desc(mean_abnd)) %>%
left_join(TAXTAB %>% select(Seq_ID, OrgName, Species, Genus))
Joining, by = "Seq_ID"
n = 6; nclust = length(unique(top_genus$Cluster))
df_top_genus <- top_genus %>%
group_by(Cluster) %>%
top_n(n, wt = prev*mean_abnd) %>%
arrange(Cluster, -prev, -mean_abnd) %>%
select(-mean_abnd, -prev) %>% ungroup() %>%
mutate(Cluster = paste0("Cluster", Cluster)) %>%
mutate(idx = rep(1:n, nclust)) %>%
spread(key = Cluster, Genus)
df_top_genus
bray_to_baseline_fltr %>%
filter(perturbation == "Diet", RelDay >= -30, RelDay <= 30) %>%
mutate(Interval = factor(Interval, level = names(diet_intv_cols))) %>%
ggplot(
aes(x = RelDay, y = dist_to_baseline,
group = Subject, color = Interval)) +
geom_line(aes(group = Subject), alpha = 0.7, lwd = 0.5) +
geom_point(alpha = 0.5, size = 1.2) +
scale_color_manual(values = diet_intv_cols) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(size=3))) +
xlab("Days from diet initiation") +
ylab("Bray-Curtis distance to 7 pre-diet samples")
fpca.bray.diet30 <- fit_dist_to_baseline(
bray_to_baseline_fltr %>% filter(perturbation == "Diet", RelDay >= -30, RelDay <= 30))
save(list = c("fpca.bray.abx", "fpca.bray.diet", "fpca.bray.diet30"), file = "output/fpca_res.rda")
(pDiet <- bray_to_baseline_fltr %>%
filter(perturbation == "Diet", RelDay >= -30, RelDay <= 30) %>%
ggplot(aes(x = RelDay, y = dist_to_baseline)) +
geom_line(
data = fpca.bray.diet30[["fitted"]],
aes(group = Subject, x = time, y = value),
alpha = 0.3, size = 0.7, color = "grey30") +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_point(aes(color = Interval), size = 1.5, alpha = 0.7) +
geom_line(
data = fpca.bray.diet30[["mean"]], aes(x = time, y = value),
color = "navy", size = 2) +
scale_color_manual(values = diet_intv_cols, name = "Interval") +
scale_x_continuous(
name = "Days from diet initiation",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
scale_y_continuous(
name = "Bray-Curtis distance to baseline",
limits = c(NA, NA), breaks = seq(0.1, 0.80, 0.1)) +
theme(text = element_text(size = 20)))
fpca.bray.diet30[["fitted"]] <- fpca.bray.diet30[["fitted"]] %>%
mutate(
Interval = ifelse(fpca.bray.diet30[["fitted"]]$time < 0 , "PreDiet",
ifelse(fpca.bray.diet30[["fitted"]]$time >= 0 & fpca.bray.diet30[["fitted"]]$time <= 4, "MidDiet",
"PostDiet")))
(pDietDeriv <- fpca.bray.diet30[["fitted"]] %>%
ggplot(aes(x = RelDay)) +
geom_line(
aes(group = Subject, x = time, y = deriv, color = Interval),
alpha = 0.5, size = 0.7) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
scale_color_manual(values = diet_intv_cols, name = "Interval") +
scale_x_continuous(name = "",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
ylab("Derivative"))
(pDietLab <- bray_to_baseline_fltr %>%
filter(perturbation == "Diet", RelDay >= -50, RelDay <= 60) %>%
ggplot(aes(x = RelDay, y = dist_to_baseline)) +
geom_line(
data = fpca.bray.diet[["fitted"]],
aes(group = Subject, x = time, y = value),
alpha = 0.3, size = 0.7 ) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_text(
aes(label = Subject, color = Interval), size = 4, alpha = 0.7) +
geom_line(
data = fpca.bray.diet[["mean"]], aes(x = time, y = value),
color = "navy", size = 2) +
scale_color_manual(values = diet_intv_cols, name = "Interval") +
scale_x_continuous(
name = "Days from diet initiation",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
scale_y_continuous(
name = "Bray-Curtis distance to baseline",
limits = c(0.1, 0.85), breaks = seq(0.1, 0.80, 0.1)) +
theme(text = element_text(size = 20)))
vp = grid::viewport(width = 0.3, height = 0.32, x = 0.07, y =0.95, just = c("left", "top"))
print(pDiet)
print(pDietDeriv + theme_bw(base_size = 10) + theme_subplot, vp = vp)
dietFac <- c("PreDiet", "MidDiet", "PostDiet")
diet <- subset_samples(psSubj, Diet_RelDay >= -30 & Diet_RelDay <= 30)
diet <- subset_taxa(diet, taxa_sums(diet) > 0)
diet
(replicated <- data.frame(sample_data(diet)) %>%
group_by(Subject, Group, Diet_Interval, Diet_RelDay) %>%
mutate(n = n()) %>% ungroup() %>%
filter(n > 1) %>% select(Meas_ID, Subject, Diet_RelDay, Diet_Interval))
# Remove replicated samples
sample_sums(diet)[replicated$Meas_ID]
# M7869 M7886
# 74697 74873
# we retain a replicate with higher sample depth:
diet <- subset_samples(diet, !Meas_ID %in% c("M2129"))
diet
# otu_table() OTU Table: [ 2333 taxa and 2611 samples ]
diet.rsv <- data.frame(asinh(as(otu_table(diet), "matrix"))) %>%
rownames_to_column("Seq_ID") %>%
gather(Meas_ID, abundance, -Seq_ID) %>%
left_join(SMP) %>%
left_join(TAXTAB) %>%
mutate(Diet_Interval = factor(Diet_Interval, levels = dietFac)) %>%
arrange(Seq_ID, Subject, Diet_RelDay)
thresh <- 0; num_nonzero <- 10;
min_no_subj <- 3
keep_rsv <- diet.rsv %>%
filter(abundance > thresh) %>%
group_by(Subject, Seq_ID) %>%
summarise(Freq = n()) %>% # No. of samples per subject w/ positive rsv count
filter(Freq >= num_nonzero) %>%
group_by(Seq_ID) %>%
summarise(Freq = n()) %>% # Number of subjects w/ num_nonzero samples with counts > thresh
filter(Freq >= min_no_subj) %>%
arrange(desc(Freq))
length(unique(keep_rsv$Seq_ID)) # = 754
diet.rsv.subset <- diet.rsv %>%
filter(Seq_ID %in% unique(keep_rsv$Seq_ID))
# save(list = c("keep_rsv", "diet.rsv.subset"),
# file = "output/fpca_clust_res_diet.rda")
diet_fclust <- fpca_wrapper(
diet.rsv.subset,
time_column = "Diet_RelDay",
value_column = "abundance",
replicate_column = "Subject",
feat_column = "Seq_ID",
cluster = TRUE,
clust_min_num_replicate = 15,
fpca_optns = NULL, fclust_optns = NULL,
parallel = TRUE, ncores = 16)
diet_fclust_mu <- get_fpca_means(diet_fclust)
diet_fclust_subjFit <- get_fpca_fits(diet_fclust)
save(list = c("keep_rsv", "diet.rsv.subset",
"diet_fclust", "diet_fclust_subjFit", "diet_fclust_mu"),
file = "output/fpca_clust_res_diet.rda")
We now find the taxa with the most variability in the response to Diet between two clusters:
# Taxa's trajectory difference between two clusters
load("output/fpca_clust_res_diet.rda")
diffBetweenClusts.diet <- diet_fclust_subjFit %>%
group_by(Feature_ID, time, Cluster ) %>%
summarise(value = mean(value, na.rm = TRUE)) %>%
spread(key = "Cluster", value = "value") %>%
ungroup() %>%
mutate(diff = abs(`1` - `2`)) %>%
group_by(Feature_ID) %>%
summarise(
mean_clustDist_L1 = mean(diff, na.rm = TRUE),
sd_Clust1 = sd(`1`),
sd_Clust2 = sd(`2`)) %>%
filter(
!is.na(mean_clustDist_L1)
) %>% # Some taxa have a single cluster (everyone behaves similarly)
arrange(-mean_clustDist_L1)
summary(diffBetweenClusts.diet)
Feature_ID mean_clustDist_L1 sd_Clust1 sd_Clust2
Length:178 Min. :0.07593 Min. :0.02711 Min. :0.03151
Class :character 1st Qu.:1.32221 1st Qu.:0.10793 1st Qu.:0.09215
Mode :character Median :1.82753 Median :0.19367 Median :0.18530
Mean :1.87661 Mean :0.25623 Mean :0.25712
3rd Qu.:2.37622 3rd Qu.:0.33470 3rd Qu.:0.33276
Max. :5.65328 Max. :1.49890 Max. :1.68105
diffBetweenClusts.diet <- diffBetweenClusts.diet %>%
filter(sd_Clust1 > median(diffBetweenClusts.diet$sd_Clust1) |
sd_Clust2 > median(diffBetweenClusts.diet$sd_Clust2))
# Plot top 20
seq_to_plot <- diffBetweenClusts.diet$Feature_ID[1:20]
tax_to_plot <- (TAXTAB %>% column_to_rownames("Seq_ID"))[seq_to_plot, "OrgName"]
tax_to_plot
[1] "Seq19: Bifidobacterium NA" "Seq3: Faecalibacterium prausnitzii"
[3] "Seq5: Bacteroides vulgatus" "Seq31: Clostridium_XVIII NA"
[5] "Seq29: Bacteroides NA" "Seq37: Faecalibacterium prausnitzii"
[7] "Seq159: Parasutterella excrementihominis" "Seq22: Bacteroides NA"
[9] "Seq8: Bacteroides dorei" "Seq204: Clostridium_XlVa NA"
[11] "Seq16: Akkermansia muciniphila" "Seq145: Ruminococcus2 NA"
[13] "Seq59: Bacteroides NA" "Seq79: Blautia NA"
[15] "Seq180: NA" "Seq101: Phascolarctobacterium faecium"
[17] "Seq2: Bacteroides vulgatus" "Seq32: Bacteroides ovatus"
[19] "Seq24: Dehalobacter NA" "Seq131: NA"
seq1_subjClusters <- diet_fclust_subjFit %>%
filter(Feature_ID == "Seq1") %>%
select(Replicate_ID, Cluster) %>% distinct() %>%
filter(Cluster==2)
diet2plot.fclust <- diet_fclust_subjFit %>%
left_join(TAXTAB, by = c("Feature_ID" = "Seq_ID")) %>%
group_by(Feature_ID) %>%
mutate(
n_cluster1 = sum(Cluster == 1),
n_cluster2 = sum(Cluster == 2),
majorityCluster = ifelse(n_cluster1 > n_cluster2, 1, 2)
) %>%
ungroup() %>%
mutate(
OrgName = factor(OrgName, levels = tax_to_plot),
Subject_Cluster = ifelse(Cluster == majorityCluster, "Majority", "Minority"))
diet2plot <- diet.rsv.subset %>%
filter(Seq_ID %in% seq_to_plot) %>%
left_join(
diet2plot.fclust %>%
select(-value, -time) %>%
distinct() %>%
rename(Seq_ID = Feature_ID, Subject = Replicate_ID)
) %>%
mutate(
Seq_ID = factor(Seq_ID, levels = seq_to_plot),
OrgName = factor(OrgName, levels = tax_to_plot))
Joining, by = c("Seq_ID", "Subject", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "OrgName")
Column `OrgName` joining character vector and factor, coercing into character vector
diet2plot.fclust %>%
filter(Feature_ID %in% seq_to_plot) %>%
ggplot(aes(x = time, y = value, color = Subject_Cluster)) +
geom_line(
aes(group = Replicate_ID),
alpha = 0.3, size = 0.5
) +
geom_smooth(se = FALSE) +
geom_point(
data = diet2plot,
aes(x = Diet_RelDay, y = abundance),
size = 0.3, alpha = 0.5
) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_line(
data = diet_fclust_mu %>%
filter(Feature_ID %in% seq_to_plot) %>%
left_join(TAXTAB, by = c("Feature_ID" = "Seq_ID")) %>%
mutate(OrgName = factor(OrgName, levels = tax_to_plot)),
color = "red", size = 1
) +
scale_color_manual(values = c( "grey17", "deepskyblue2")) +
facet_wrap(~ OrgName, scales = "free", ncol = 4) +
theme(strip.text = element_text(family="Helvetica-Narrow", size = 10)) +
scale_x_continuous(name = "Days from diet initiation",
breaks = seq(-40, 120, 20), labels = seq(-40, 120, 20),
limits = c(NA, NA))
Using mean response to Diet
diet_fclust_majority <- diet2plot.fclust %>%
filter(Subject_Cluster == "Majority") %>%
left_join(keep_rsv, by = c("Feature_ID" = "Seq_ID")) %>%
filter(Freq >= min_num_subj) %>%
select(-Freq) %>%
group_by(time, Feature_ID) %>%
summarise(value = mean(value)) %>%
left_join(TAXTAB, by = c("Feature_ID" = "Seq_ID")) %>%
arrange( Feature_ID, time)
length(unique(diet_fclust_majority$Feature_ID))
[1] 500
#[1] 500
set.seed(123456)
min_num_subj <- 5
seqClusters.diet <- fit_fpca(
diet_fclust_majority,
"time", "value", "Feature_ID",
cluster = TRUE, K = 6)
seqClusters_fpca.diet <- fitted_values_fpca(seqClusters.diet) %>%
mutate(Seq_ID = Replicate_ID)
write.csv(seqClusters_fpca.diet %>% select(Seq_ID, Cluster) %>% distinct() %>%
left_join(TAXTAB) %>% arrange(Cluster),
file = "output/diet_seqClusters.csv")
Joining, by = "Seq_ID"
ggplot(
seqClusters_fpca.diet, aes(x = time, y = value)) +
geom_line(
aes(group = Seq_ID, color = Cluster),
size = 0.7, alpha = 0.5
) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 5, lwd = 1, color = "orange") +
geom_smooth(color = "grey20") +
facet_wrap(~ Cluster, labeller = label_both, ncol = 3) +
scale_color_brewer(palette = "Set1") +
guides(color = guide_legend(override.aes = list(size=4)))
top_rsv <- seqClusters_fpca.diet %>%
group_by(Seq_ID, Cluster) %>%
summarise(mean_abnd = mean(value)) %>%
arrange(desc(mean_abnd)) %>%
left_join(TAXTAB %>% select(Seq_ID, OrgName, Species, Genus))
Joining, by = "Seq_ID"
top_genus <- top_rsv %>%
filter(!is.na(Genus)) %>%
group_by(Genus, Cluster) %>%
summarise(
mean_abnd = mean(mean_abnd),
prev = n()) %>%
arrange(Cluster, -prev, -mean_abnd)
n = 10; nclust = length(unique(top_rsv$Cluster))
df_top_rsv <- top_rsv %>%
ungroup() %>%
select(OrgName, Cluster, mean_abnd) %>%
group_by(Cluster) %>%
top_n(n, wt = mean_abnd) %>%
arrange(Cluster, mean_abnd) %>%
select(-mean_abnd) %>% ungroup() %>%
mutate(Cluster = paste0("Cluster", Cluster)) %>%
mutate(idx = rep(1:n, nclust)) %>%
spread(key = Cluster, OrgName)
df_top_rsv %>% select(-idx)
bray_to_baseline_fltr %>%
filter(perturbation == "CC", RelDay >= -50, RelDay <= 50) %>%
mutate(Interval = factor(Interval, level = names(cc_intv_cols))) %>%
ggplot(
aes(x = RelDay, y = dist_to_baseline,
group = Subject, color = Interval)) +
geom_line(aes(group = Subject), alpha = 0.7, lwd = 0.5) +
geom_point(alpha = 0.5, size = 1.2) +
scale_color_manual(values = cc_intv_cols) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(size=3))) +
xlab("Days from colon cleanout") +
ylab("Bray-Curtis distance to 7 pre-diet samples")
fpca.bray.cc30 <- fit_dist_to_baseline(
bray_to_baseline_fltr %>%
filter(perturbation == "CC", RelDay >= -30, RelDay <= 30) %>%
arrange(Subject, RelDay))
save(list = c("fpca.bray.abx", "fpca.bray.diet","fpca.bray.diet30",
"fpca.bray.cc", "fpca.bray.cc30"),
file = "output/fpca_res.rda")
(pCC <- bray_to_baseline_fltr %>%
filter(perturbation == "CC", RelDay >= -30, RelDay <= 30) %>%
ggplot(aes(x = RelDay, y = dist_to_baseline)) +
geom_line(
data = fpca.bray.cc30[["fitted"]],
aes(group = Subject, x = time, y = value),
alpha = 0.3, size = 0.7, color = "grey30") +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_point(aes(color = Interval), size = 1.5, alpha = 0.7) +
geom_line(
data = fpca.bray.cc30[["mean"]], aes(x = time, y = value),
color = "navy", size = 2) +
scale_color_manual(values = cc_intv_cols, name = "Interval") +
scale_x_continuous(
name = "Days from colon cleanout",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
scale_y_continuous(
name = "Bray-Curtis distance to baseline",
limits = c(NA, NA), breaks = seq(0.1, 0.80, 0.1)) +
theme(text = element_text(size = 20)))
fpca.bray.cc30[["fitted"]] <- fpca.bray.cc30[["fitted"]] %>%
mutate(Interval = ifelse(fpca.bray.cc30[["fitted"]]$time < 0 , "PreCC","PostCC"))
(pCCDeriv <- fpca.bray.cc30[["fitted"]] %>%
ggplot(aes(x = RelDay)) +
geom_line(
aes(group = Subject, x = time, y = deriv, color = Interval),
alpha = 0.5, size = 0.7) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
scale_color_manual(values = cc_intv_cols, name = "Interval") +
scale_x_continuous(name = "",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
ylab("Derivative"))
(pCCLab <- bray_to_baseline_fltr %>%
filter(perturbation == "CC", RelDay >= -50, RelDay <= 50) %>%
ggplot(aes(x = RelDay, y = dist_to_baseline)) +
geom_line(
data = fpca.bray.cc[["fitted"]],
aes(group = Subject, x = time, y = value),
alpha = 0.3, size = 0.7 ) +
geom_vline(xintercept = 0, lwd = 1, color = "orange") +
geom_vline(xintercept = 4, lwd = 1, color = "orange") +
geom_text(
aes(label = Subject, color = Interval), size = 4, alpha = 0.7) +
geom_line(
data = fpca.bray.cc[["mean"]], aes(x = time, y = value),
color = "navy", size = 2) +
scale_color_manual(values = cc_intv_cols, name = "Interval") +
scale_x_continuous(
name = "Days from colon cleanout",
limits = c(NA, NA), breaks = seq(-50, 60, 10)) +
scale_y_continuous(
name = "Bray-Curtis distance to baseline",
limits = c(0.1, 0.85), breaks = seq(0.1, 0.80, 0.1)) +
theme(text = element_text(size = 20)))
vp = grid::viewport(width = 0.3, height = 0.32, x = 0.07, y =0.95, just = c("left", "top"))
print(pCC)
print(pCCDeriv + theme_bw(base_size = 10) + theme_subplot, vp = vp)
sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /share/software/user/open/openblas/0.2.19/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] fdapace_0.4.1 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
[6] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0 tidyverse_1.2.1.9000
[11] RColorBrewer_1.1-2 phyloseq_1.26.1
loaded via a namespace (and not attached):
[1] nlme_3.1-140 fs_1.3.1 lubridate_1.7.4 httr_1.4.0 numDeriv_2016.8-1.1
[6] tools_3.5.1 backports_1.1.4 R6_2.4.0 vegan_2.5-5 rpart_4.1-15
[11] Hmisc_4.2-0 DBI_1.0.0 lazyeval_0.2.2 BiocGenerics_0.28.0 mgcv_1.8-28
[16] colorspace_1.4-1 nnet_7.3-12 permute_0.9-5 ade4_1.7-13 withr_2.1.2
[21] gridExtra_2.3 tidyselect_0.2.5 compiler_3.5.1 cli_1.1.0 rvest_0.3.4
[26] Biobase_2.42.0 htmlTable_1.13.1 xml2_1.2.0 labeling_0.3 checkmate_1.9.4
[31] scales_1.0.0 digest_0.6.20 foreign_0.8-71 rmarkdown_1.14 XVector_0.22.0
[36] base64enc_0.1-3 pkgconfig_2.0.2 htmltools_0.3.6 dbplyr_1.4.2 htmlwidgets_1.3
[41] rlang_0.4.0 readxl_1.3.1 rstudioapi_0.10 generics_0.0.2 jsonlite_1.6
[46] acepack_1.4.1 magrittr_1.5 Formula_1.2-3 biomformat_1.10.1 Matrix_1.2-17
[51] Rcpp_1.0.1 munsell_0.5.0 S4Vectors_0.20.1 Rhdf5lib_1.4.3 ape_5.3
[56] stringi_1.4.3 yaml_2.2.0 MASS_7.3-51.4 zlibbioc_1.28.0 rhdf5_2.26.2
[61] plyr_1.8.4 grid_3.5.1 parallel_3.5.1 crayon_1.3.4 lattice_0.20-38
[66] Biostrings_2.50.2 haven_2.1.1 splines_3.5.1 multtest_2.38.0 hms_0.5.0
[71] zeallot_0.1.0 knitr_1.23 pillar_1.4.2 igraph_1.2.4.1 reshape2_1.4.3
[76] codetools_0.2-16 stats4_3.5.1 reprex_0.3.0 glue_1.3.1 evaluate_0.14
[81] latticeExtra_0.6-28 data.table_1.12.2 modelr_0.1.4 vctrs_0.2.0 foreach_1.4.4
[86] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1 xfun_0.8 broom_0.5.2
[91] pracma_2.2.5 survival_2.44-1.1 iterators_1.0.10 IRanges_2.16.0 cluster_2.1.0